*****************************************************************
* Replication directory for                                   ***
* Prime locations                                             ***
* by Gabriel M. Ahlfeldt, Thilo N.H. Albers, Kristian Behrens ***
* Published in American Economic Review: Insights             ***
*****************************************************************
* 01/2025
* Stata
version 17.0

* This do file calls Python Geopands to overlay the ZCTA shapefile with the
* prime location shapefile and compute the share of area covered by prime locations. 
* It also saves the geographic area of ZCTAs

* Generate subfolders for outputs
	capture mkdir "$temp/data"
	capture mkdir "$temp/data/US-CBSAs"


python: 

import geopandas as gpd
import pandas as pd
import os
from tqdm import tqdm

# Set the root directory to the Stata working directory
root_dir = os.getcwd()

# Define file paths relative to the root directory
zcta_shapefile = os.path.join(root_dir, '_Data', 'US METRO DATA', 'GIS Data', 'County Business Pattern Boundaries', '2020', 'tl_2020_us_zcta520', 'tl_2020_us_zcta520.shp')
pl_shapefile = os.path.join(root_dir, '_Work', 'Output', 'Shapes', 'US', 'PL_all_USCBSAs.shp')
output_csv = os.path.join(root_dir, '_Work', 'TEMP', 'Data', 'US-CBSAs', 'ZCTA_PLshare.csv')

# Ensure output directory exists
os.makedirs(os.path.dirname(output_csv), exist_ok=True)

# Load shapefiles
try:
    zcta_gdf = gpd.read_file(zcta_shapefile)
    pl_gdf = gpd.read_file(pl_shapefile)
except Exception as e:
    print(f"Error loading shapefiles: {e}")
    exit()

# Check and reproject CRS if necessary
if zcta_gdf.crs.is_geographic:
    print("Reprojecting ZCTA data to EPSG:3857 for area calculation.")
    zcta_gdf = zcta_gdf.to_crs("EPSG:3857")

if pl_gdf.crs != zcta_gdf.crs:
    print("Reprojecting prime locations data to match ZCTA CRS.")
    pl_gdf = pl_gdf.to_crs(zcta_gdf.crs)

# Calculate area of each ZCTA
zcta_gdf['ZCTA_area_km2'] = zcta_gdf.geometry.area / 1e6

# Initialize a list to store results
results = []

# Loop over each ZCTA polygon with a progress bar
for idx, zcta in tqdm(zcta_gdf.iterrows(), total=zcta_gdf.shape[0], desc="Processing ZCTAs"):
    intersection = gpd.overlay(gpd.GeoDataFrame([zcta], crs=zcta_gdf.crs), pl_gdf, how='intersection')
    
    # Calculate the area of the intersection
    intersection_area_km2 = intersection.geometry.area.sum() / 1e6
    
    # Calculate the share of the ZCTA covered by prime locations
    coverage_share = intersection_area_km2 / zcta['ZCTA_area_km2'] if zcta['ZCTA_area_km2'] > 0 else 0
    
    # Store the results
    results.append({
        'ZCTA': zcta['ZCTA5CE20'],  # Replace with the correct column name for ZCTA identifier if different
        'ZCTA_area_km2': zcta['ZCTA_area_km2'],
        'PL_coverage_km2': intersection_area_km2,
        'PL_coverage_share': coverage_share
    })

# Convert results to a DataFrame and save to CSV
results_df = pd.DataFrame(results)
results_df.to_csv(output_csv, index=False)
print(f"Results saved to {output_csv}")


end

* Script ends